{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import nltk\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap4 编写结构化的程序\n",
    "1.  怎样才能写出结构良好，可读性强的程序，从而方便重用？\n",
    "2.  基本的结构块，例如：循环、函数和赋值是如何执行的？\n",
    "3.  Python 编程的陷阱还有哪些，如何避免它们？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 4.3 风格的问题(P152)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 4.3.1 Python代码的风格\n",
    "\n",
    "Python 代码风格指南: http://www.python.org/dev/peps/pep-0008/"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import brown\n",
    "\n",
    "rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')\n",
    "\n",
    "cv_words_pairs = [\n",
    "        (cv, w)\n",
    "        for w in rotokas_words\n",
    "        for cv in re.findall('[ptksvr][aeiou]', w)\n",
    "]\n",
    "cfd = nltk.ConditionalFreqDist(\n",
    "        (genre, word)\n",
    "        for genre in brown.categories()\n",
    "        for word in brown.words(categories=genre)\n",
    ")\n",
    "\n",
    "ha_words = [\n",
    "        'aaahhhh', 'ah', 'ahah', 'ahahah', 'ahh', 'ahhahahaha',\n",
    "        'ahhh', 'ahhhh', 'ahhhhhh', 'ahhhhhhhh', 'ha', 'haaa',\n",
    "        'hah', 'haha', 'hahaaa', 'hahah', 'hahaha'\n",
    "]\n",
    "\n",
    "syllables = []\n",
    "\n",
    "\n",
    "def process(aList):\n",
    "    # process sth.\n",
    "    return\n",
    "\n",
    "\n",
    "if (\n",
    "        len(syllables) > 4\n",
    "        and len(syllables[2]) == 3\n",
    "        and syllables[2][2] in ['a', 'e', 'i', 'o', 'u']\n",
    "        and syllables[2][3] == syllables[1][3]\n",
    "):\n",
    "    process(syllables)"
   ]
  },
  {
   "source": [
    "### 4.3.2 过程风格 与 声明风格(P153)\n",
    "统计布朗语料库中词的平均长度"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "total / count=4.402\n"
     ]
    }
   ],
   "source": [
    "# 使用 for 循环\n",
    "tokens = nltk.corpus.brown.words(categories='news')\n",
    "count = 0\n",
    "total = 0\n",
    "for token in tokens:\n",
    "    count += 1\n",
    "    total += len(token)\n",
    "print('total / count={:.3f}'.format(total / count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "total / count=4.402\n"
     ]
    }
   ],
   "source": [
    "# 使用生成器表达式\n",
    "token_list = [\n",
    "        len(t)\n",
    "        for t in tokens\n",
    "]\n",
    "total = sum(\n",
    "        len(t)\n",
    "        for t in tokens\n",
    ")\n",
    "print('total / count={:.3f}'.format(total / len(tokens)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "慢速代码\n",
      "['$1,000', '$10', '$100', '$12', '$15,000,000', '$157,460', '$3', '$30', '$4', '$451,500', '$5,000,000', '$50', '$88,000', '&', \"'\", \"''\", '(', ')', ',', '--', '.', '1', '1,119', '10', '100,000', '114', '12', '13', '13th', '150', '17', '17,000', '18', '182', '1913', '1923', '1937', '1958', '1961', '1961-62', '1962', '2', '22', '24', '250', '29-5', '3', '30', '300,000', '31', '4', '4-year', '4.4', '402', '50', '6', '63', '637', '65', '71', '74', '8', '81', '87-31', ':', '?', 'A', 'A.', 'ADC', 'Acting', 'Affairs', 'After', 'Agency', 'Aikin', 'Aj']\n",
      "快速代码\n",
      "['$1,000', '$10', '$100', '$12', '$15,000,000', '$157,460', '$3', '$30', '$4', '$451,500', '$5,000,000', '$50', '$88,000', '&', \"'\", \"''\", '(', ')', ',', '--', '.', '1', '1,119', '10', '100,000', '114', '12', '13', '13th', '150', '17', '17,000', '18', '182', '1913', '1923', '1937', '1958', '1961', '1961-62', '1962', '2', '22', '24', '250', '29-5', '3', '30', '300,000', '31', '4', '4-year', '4.4', '402', '50', '6', '63', '637', '65', '71', '74', '8', '81', '87-31', ':', '?', 'A', 'A.', 'ADC', 'Acting', 'Affairs', 'After', 'Agency', 'Aikin', 'Aj']\n"
     ]
    }
   ],
   "source": [
    "# 对单词排序\n",
    "print(\"慢速代码\")\n",
    "word_list = []\n",
    "i = 0\n",
    "while i < len(tokens[:5000]):\n",
    "    j = 0\n",
    "    while j < len(word_list) and word_list[j] <= tokens[i]:\n",
    "        j += 1\n",
    "    if j == 0 or tokens[i] != word_list[j - 1]:\n",
    "        word_list.insert(j, tokens[i])\n",
    "    i += 1\n",
    "print(word_list[:75])\n",
    "\n",
    "# 下面是等效的代码，代码更简洁，速度更快\n",
    "print(\"快速代码\")\n",
    "word_list = sorted(set(tokens[:5000]))\n",
    "print(word_list[:75])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "  1   5.40% the\n  2  10.42% ,\n  3  14.67% .\n  4  17.78% of\n  5  20.19% and\n  6  22.40% to\n  7  24.29% a\n  8  25.97% in\n"
     ]
    }
   ],
   "source": [
    "# 统计布朗语料库中单词占比数，超过25%后停止输出\n",
    "fd = nltk.FreqDist(nltk.corpus.brown.words())\n",
    "cumulative = 0.0\n",
    "most_common_words = [word for (word, count) in fd.most_common()]\n",
    "for rank, word in enumerate(most_common_words):\n",
    "    cumulative += fd.freq(word)  # word在总文本中的占比数\n",
    "    print(\"%3d %6.2f%% %s\" % (rank + 1, cumulative * 100, word))\n",
    "    if cumulative > 0.25:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "longest word:unextinguishable\n",
      "['unextinguishable', 'transubstantiate', 'inextinguishable', 'incomprehensible']\n"
     ]
    }
   ],
   "source": [
    "# P155 寻找最长的单词\n",
    "# 第一种方法：只能找到第一个长度最长的词\n",
    "text = nltk.corpus.gutenberg.words('milton-paradise.txt')\n",
    "longest = ''\n",
    "for word in text:\n",
    "    if len(word) > len(longest):\n",
    "        longest = word\n",
    "print('longest word:{}'.format(longest))\n",
    "\n",
    "# 下面是等效的代码\n",
    "# 第二种方法：使用两个链表推导式，可以找到所有最长的词\n",
    "maxlen = max(len(word) for word in text)\n",
    "print([word for word in text if len(word) == maxlen])"
   ]
  },
  {
   "source": [
    "### 4.3.3 计数器（counter）的常规用法"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3-grams=  [['The', 'dog', 'gave'], ['dog', 'gave', 'John'], ['gave', 'John', 'the'], ['John', 'the', 'newspaper']]\n3-grams=  [('The', 'dog', 'gave'), ('dog', 'gave', 'John'), ('gave', 'John', 'the'), ('John', 'the', 'newspaper')]\n2-grams=  [('The', 'dog'), ('dog', 'gave'), ('gave', 'John'), ('John', 'the'), ('the', 'newspaper')]\n4-grams=  [('The', 'dog', 'gave', 'John'), ('dog', 'gave', 'John', 'the'), ('gave', 'John', 'the', 'newspaper')]\n"
     ]
    }
   ],
   "source": [
    "# 使用循环变量来提取链表中连续重叠的3-grams\n",
    "n = 3\n",
    "sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']\n",
    "print(\"3-grams= \", [sent[i:i + n] for i in range(len(sent) - n + 1)])\n",
    "# 下面是等效的代码\n",
    "print(\"3-grams= \", list(nltk.trigrams(sent)))\n",
    "# 下面是 2-grams\n",
    "print(\"2-grams= \", list(nltk.bigrams(sent)))\n",
    "# 下面是 4-grams\n",
    "print(\"4-grams= \", list(nltk.ngrams(sent, 4)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[[set(), set(), set(), set(), set(), set(), set()],\n [set(), set(), set(), set(), set(), set(), set()],\n [set(), set(), set(), set(), set(), {'Alice'}, set()]]\n"
     ]
    }
   ],
   "source": [
    "import pprint\n",
    "\n",
    "# 使用循环变量构建多维结构\n",
    "# 嵌套的链表推导式\n",
    "m, n = 3, 7\n",
    "array = [\n",
    "        [\n",
    "                set()\n",
    "                for i in range(n)\n",
    "        ]\n",
    "        for j in range(m)\n",
    "]\n",
    "array[2][5].add('Alice')\n",
    "pprint.pprint(array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[[{7}, {7}, {7}, {7}, {7}, {7}, {7}],\n [{7}, {7}, {7}, {7}, {7}, {7}, {7}],\n [{7}, {7}, {7}, {7}, {7}, {7}, {7}]]\n"
     ]
    }
   ],
   "source": [
    "array = [[set()] * n] * m\n",
    "array[2][5].add(7)\n",
    "pprint.pprint(array)"
   ]
  }
 ]
}